{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert Columns to categoricals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from numpy import nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col1</th>\n",
       "      <th>col2</th>\n",
       "      <th>col3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2.0</td>\n",
       "      <td>a</td>\n",
       "      <td>2020-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>b</td>\n",
       "      <td>2020-01-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>c</td>\n",
       "      <td>2020-01-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>d</td>\n",
       "      <td>2020-01-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>a</td>\n",
       "      <td>2020-01-05</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   col1 col2        col3\n",
       "0   2.0    a  2020-01-01\n",
       "1   1.0    b  2020-01-02\n",
       "2   3.0    c  2020-01-03\n",
       "3   1.0    d  2020-01-04\n",
       "4   NaN    a  2020-01-05"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"col1\": [2.0, 1.0, 3.0, 1.0, nan],\n",
    "        \"col2\": [\"a\", \"b\", \"c\", \"d\", \"a\"],\n",
    "        \"col3\": [\"2020-01-01\", \"2020-01-02\", \"2020-01-03\", \"2020-01-04\", \"2020-01-05\"],\n",
    "    }\n",
    ")\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "col1    float64\n",
       "col2     object\n",
       "col3     object\n",
       "dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Specific columns can be converted to category type:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat = df.encode_categorical(column_names=[\"col1\", \"col2\", \"col3\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "col1    category\n",
       "col2    category\n",
       "col3    category\n",
       "dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that for the code above, the categories were inferred from the columns, and is unordered:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    2020-01-01\n",
       "1    2020-01-02\n",
       "2    2020-01-03\n",
       "3    2020-01-04\n",
       "4    2020-01-05\n",
       "Name: col3, dtype: category\n",
       "Categories (5, object): ['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat[\"col3\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Explicit categories can be provided, and ordered via the `kwargs`` parameter:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat = df.encode_categorical(\n",
    "    col1=([3, 2, 1, 4], \"appearance\"), col2=([\"a\", \"d\", \"c\", \"b\"], \"sort\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      2\n",
       "1      1\n",
       "2      3\n",
       "3      1\n",
       "4    NaN\n",
       "Name: col1, dtype: category\n",
       "Categories (4, int64): [3 < 2 < 1 < 4]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat[\"col1\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "2    c\n",
       "3    d\n",
       "4    a\n",
       "Name: col2, dtype: category\n",
       "Categories (4, object): ['a' < 'd' < 'c' < 'b']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat[\"col2\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When the `order` parameter is `appearance`, the `categories` argument is used as-is; if the `order` is `sort`, the `categories` argument is sorted in ascending order; if `order` is `None``, then the `categories` argument is applied unordered."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A User Warning will be generated if some or all of the unique values in the column are not present in the provided `categories` argument."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspaces/pyjanitor/janitor/functions/encode_categorical.py:131: UserWarning: \n",
      "                     None of the values in col1 are in\n",
      "                     [4, 5, 6];\n",
      "                     this might create nulls for all values\n",
      "                     in the new categorical column.\n",
      "                     \n",
      "  categories_dict = _as_categorical_checks(df, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "cat = df.encode_categorical(col1=([4, 5, 6], \"appearance\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    NaN\n",
       "1    NaN\n",
       "2    NaN\n",
       "3    NaN\n",
       "4    NaN\n",
       "Name: col1, dtype: category\n",
       "Categories (3, int64): [4 < 5 < 6]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat[\"col1\"]"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
  },
  "kernelspec": {
   "display_name": "Python 3.8.10 64-bit ('base': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
